{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "SPAM_PATH = os.path.join('datasets', 'spam')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "HAM_DIR = os.path.join(SPAM_PATH, 'easy_ham')\n",
    "SPAM_DIR = os.path.join(SPAM_PATH, 'spam')\n",
    "ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]\n",
    "spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 我们可以使用python的“email”模块解析这些电子邮件（它处理邮件头、编码等）\n",
    "import email\n",
    "import email.policy\n",
    "\n",
    "def load_email(is_spam, email_name, spam_path=SPAM_PATH):\n",
    "    directory= 'spam' if is_spam else 'easy_ham' # 判断是否是垃圾邮件来选择文件夹\n",
    "    with open(os.path.join(spam_path, directory, email_name), 'rb') as f:\n",
    "        return email.parser.BytesParser(policy=email.policy.default).parse(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Martin A posted:\n",
      "Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n",
      " limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n",
      " Mount Athos monastic community, was ideal for the patriotic sculpture. \n",
      " \n",
      " As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n",
      " museum, a restored amphitheatre and car park for admiring crowds are\n",
      "planned\n",
      "---------------------\n",
      "So is this mountain limestone or granite?\n",
      "If it's limestone, it'll weather pretty fast.\n",
      "\n",
      "------------------------ Yahoo! Groups Sponsor ---------------------~-->\n",
      "4 DVDs Free +s&p Join Now\n",
      "http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n",
      "---------------------------------------------------------------------~->\n",
      "\n",
      "To unsubscribe from this group, send an email to:\n",
      "forteana-unsubscribe@egroups.com\n",
      "\n",
      " \n",
      "\n",
      "Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/\n"
     ]
    }
   ],
   "source": [
    "# 让我们看一个ham示例和一个spam示例，了解数据的外观：\n",
    "ham_emails = [load_email(is_spam=False, email_name=name) for name in ham_filenames]\n",
    "spam_emails = [load_email(is_spam=True, email_name=name) for name in spam_filenames]\n",
    "print(ham_emails[1].get_content().strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help wanted.  We are a 14 year old fortune 500 company, that is\n",
      "growing at a tremendous rate.  We are looking for individuals who\n",
      "want to work from home.\n",
      "\n",
      "This is an opportunity to make an excellent income.  No experience\n",
      "is required.  We will train you.\n",
      "\n",
      "So if you are looking to be employed from home with a career that has\n",
      "vast opportunities, then go:\n",
      "\n",
      "http://www.basetel.com/wealthnow\n",
      "\n",
      "We are looking for energetic and self motivated people.  If that is you\n",
      "than click on the link and fill out the form, and one of our\n",
      "employement specialist will contact you.\n",
      "\n",
      "To be removed from our link simple go to:\n",
      "\n",
      "http://www.basetel.com/remove.html\n",
      "\n",
      "\n",
      "4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40\n"
     ]
    }
   ],
   "source": [
    "print(spam_emails[6].get_content().strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help wanted.  We are a 14 year old fortune 500 company, that is\n",
      "growing at a tremendous rate.  We are looking for individuals who\n",
      "want to work from home.\n",
      "\n",
      "This is an opportunity to make an excellent income.  No experience\n",
      "is required.  We will train you.\n",
      "\n",
      "So if you are looking to be employed from home with a career that has\n",
      "vast opportunities, then go:\n",
      "\n",
      "http://www.basetel.com/wealthnow\n",
      "\n",
      "We are looking for energetic and self motivated people.  If that is you\n",
      "than click on the link and fill out the form, and one of our\n",
      "employement specialist will contact you.\n",
      "\n",
      "To be removed from our link simple go to:\n",
      "\n",
      "http://www.basetel.com/remove.html\n",
      "\n",
      "\n",
      "4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(spam_emails[6].get_content())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 电子邮件实际上有很多部分，带有图像和附件（它们可以有自己的附件）。查看邮件的各种类型的结构：\n",
    "def get_email_structure(email):\n",
    "    if isinstance(email, str):\n",
    "        return email\n",
    "    payload = email.get_payload()\n",
    "    if isinstance(payload, list):\n",
    "        return 'multipart({})'.format(','.join([\n",
    "            get_email_structure(sub_email)\n",
    "            for sub_email in payload\n",
    "        ]))\n",
    "    else:\n",
    "        return email.get_content_type()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from collections import Counter\n",
    "a = [1,1,3,6,6,5,1,6,4,7,1,2]\n",
    "d = 'aedfgawgwrg'\n",
    "b = Counter()\n",
    "c = b[d]+1\n",
    "c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def structure_counter(emails):\n",
    "    structures = Counter()\n",
    "    for email in emails:\n",
    "        structure = get_email_structure(email)\n",
    "        structures[structure] += 1\n",
    "    return structures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('text/plain', 2408),\n",
       " ('multipart(text/plain,application/pgp-signature)', 66),\n",
       " ('multipart(text/plain,text/html)', 8),\n",
       " ('multipart(text/plain,text/plain)', 4),\n",
       " ('multipart(text/plain)', 3),\n",
       " ('multipart(text/plain,application/octet-stream)', 2),\n",
       " ('multipart(text/plain,text/enriched)', 1),\n",
       " ('multipart(text/plain,application/ms-tnef,text/plain)', 1),\n",
       " ('multipart(multipart(text/plain,text/plain,text/plain),application/pgp-signature)',\n",
       "  1),\n",
       " ('multipart(text/plain,video/mng)', 1),\n",
       " ('multipart(text/plain,multipart(text/plain))', 1),\n",
       " ('multipart(text/plain,application/x-pkcs7-signature)', 1),\n",
       " ('multipart(text/plain,multipart(text/plain,text/plain),text/rfc822-headers)',\n",
       "  1),\n",
       " ('multipart(text/plain,multipart(text/plain,text/plain),multipart(multipart(text/plain,application/x-pkcs7-signature)))',\n",
       "  1),\n",
       " ('multipart(text/plain,application/x-java-applet)', 1)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "structure_counter(ham_emails).most_common()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('text/plain', 218),\n",
       " ('text/html', 183),\n",
       " ('multipart(text/plain,text/html)', 45),\n",
       " ('multipart(text/html)', 20),\n",
       " ('multipart(text/plain)', 19),\n",
       " ('multipart(multipart(text/html))', 5),\n",
       " ('multipart(text/plain,image/jpeg)', 3),\n",
       " ('multipart(text/html,application/octet-stream)', 2),\n",
       " ('multipart(text/plain,application/octet-stream)', 1),\n",
       " ('multipart(text/html,text/plain)', 1),\n",
       " ('multipart(multipart(text/html),application/octet-stream,image/jpeg)', 1),\n",
       " ('multipart(multipart(text/plain,text/html),image/gif)', 1),\n",
       " ('multipart/alternative', 1)]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "structure_counter(spam_emails).most_common()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Return-Path:<12a1mailbot1@web.de>\n",
      "Delivered-To:zzzz@localhost.spamassassin.taint.org\n",
      "Received:from localhost (localhost [127.0.0.1])\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)\n",
      "Received:from mail.webnote.net [193.120.211.219]\tby localhost with POP3 (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)\n",
      "Received:from dd_it7 ([210.97.77.167])\tby webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623\tfor <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100\n",
      "From:12a1mailbot1@web.de\n",
      "Received:from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);\t Sat, 24 Aug 2002 09:42:10 +0900\n",
      "To:dcek1a1@netsgo.com\n",
      "Subject:Life Insurance - Why Pay More?\n",
      "Date:Wed, 21 Aug 2002 20:31:57 -1600\n",
      "MIME-Version:1.0\n",
      "Message-ID:<0103c1042001882DD_IT7@dd_it7>\n",
      "Content-Type:text/html; charset=\"iso-8859-1\"\n",
      "Content-Transfer-Encoding:quoted-printable\n"
     ]
    }
   ],
   "source": [
    "for head, value in spam_emails[0].items():\n",
    "    print(head+':'+value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Life Insurance - Why Pay More?'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spam_emails[0]['Subject']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 拆分训练集和测试集合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X = np.array(ham_emails + spam_emails)\n",
    "y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 首先需要一个函数来将html转换为纯文本，使用[Beautifulsoup]库，下面的函数首先删除`<head>`部分，然后将所有`<a>`标记转换为单词hyperlink，然后去掉所有html标记，只留下纯文本。为了可读性，它还用一个换行符替换多个换行符，最后它取消了HTML实体（例如`&gt；`或`&nbsp；`）\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from html import unescape\n",
    "\n",
    "def html_to_plain_text(html):\n",
    "    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)\n",
    "    text = re.sub('<a\\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)\n",
    "    text = re.sub('<.*?>', '', text, flags=re.M | re.S)\n",
    "    text = re.sub(r'(\\s*\\n)+', '\\n', text, flags=re.M | re.S)\n",
    "    return unescape(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<HTML><HEAD><TITLE></TITLE><META http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1252\"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content=\"MSHTML 6.00.2713.1100\" name=\"GENERATOR\"></HEAD>\n",
      "<BODY text=\"#000000\" vLink=\"#0033ff\" link=\"#0033ff\" bgColor=\"#CCCC99\"><TABLE borderColor=\"#660000\" cellSpacing=\"0\" cellPadding=\"0\" border=\"0\" width=\"100%\"><TR><TD bgColor=\"#CCCC99\" valign=\"top\" colspan=\"2\" height=\"27\">\n",
      "<font size=\"6\" face=\"Arial, Helvetica, sans-serif\" color=\"#660000\">\n",
      "<b>OTC</b></font></TD></TR><TR><TD height=\"2\" bgcolor=\"#6a694f\">\n",
      "<font size=\"5\" face=\"Times New Roman, Times, serif\" color=\"#FFFFFF\">\n",
      "<b>&nbsp;Newsletter</b></font></TD><TD height=\"2\" bgcolor=\"#6a694f\"><div align=\"right\"><font color=\"#FFFFFF\">\n",
      "<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height=\"25\" colspan=\"2\" bgcolor=\"#CCCC99\"><table width=\"100%\" border=\"0\"  ...\n"
     ]
    }
   ],
   "source": [
    "html_spam_emals = [email for email in X_train[y_train==1] if get_email_structure(email)=='text/html']\n",
    "sample_html_spam = html_spam_emals[7]\n",
    "print(sample_html_spam.get_content().strip()[:1000], \"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OTC\n",
      " Newsletter\n",
      "Discover Tomorrow's Winners \n",
      "For Immediate Release\n",
      "Cal-Bay (Stock Symbol: CBYI)\n",
      "Watch for analyst \"Strong Buy Recommendations\" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.\n",
      "Put CBYI on your watch list, acquire a position TODAY.\n",
      "REASONS TO INVEST IN CBYI\n",
      "A profitable company and is on track to beat ALL earnings estimates!\n",
      "One of the FASTEST growing distributors in environmental & safety equipment instruments.\n",
      "Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.\n",
      "RAPIDLY GROWING INDUSTRY\n",
      "Industry revenues exceed $900 million, estimates indicate that there could be as much as $25 billio ...\n"
     ]
    }
   ],
   "source": [
    "print(html_to_plain_text(sample_html_spam.get_content()).strip()[:1000], \"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 编写一个函数，它以电子邮件为输入，并以纯文本形式返回其内容，无论其格式是什么\n",
    "def email_to_text(email):\n",
    "    html = None\n",
    "    for part in email.walk():\n",
    "        ctype = part.get_content_type()\n",
    "        if not ctype in (\"text/plain\", \"text/html\"):\n",
    "            continue\n",
    "        try:\n",
    "            content = part.get_content()\n",
    "        except: # 解决编码问题\n",
    "            content = str(part.get_payload())\n",
    "        if ctype == \"text/plain\":\n",
    "            return content\n",
    "        else:\n",
    "            html = content\n",
    "    if html:\n",
    "        return html_to_plain_text(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<HTML><HEAD><TITLE></TITLE><META http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1252\"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content=\"MSHTML 6.00.2713.1100\" name=\"GENERATOR\"></HEAD>\n",
      "<BODY text=\"#000000\" vLink=\"#0033ff\" link=\"#0033ff\" bgColor=\"#CCCC99\"><TABLE borderColor=\"#660000\" cellSpacing=\"0\" cellPadding=\"0\" border=\"0\" width=\"100%\"><TR><TD bgColor=\"#CCCC99\" valign=\"top\" colspan=\"2\" height=\"27\">\n",
      "<font size=\"6\" face=\"Arial, Helvetica, sans-serif\" color=\"#660000\">\n",
      "<b>OTC</b></font></TD></TR><TR><TD height=\"2\" bgcolor=\"#6a694f\">\n",
      "<font size=\"5\" face=\"Times New Roman, Times, serif\" color=\"#FFFFFF\">\n",
      "<b>&nbsp;Newsletter</b></font></TD><TD height=\"2\" bgcolor=\"#6a694f\"><div align=\"right\"><font color=\"#FFFFFF\">\n",
      "<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height=\"25\" colspan=\"2\" bgcolor=\"#CCCC99\"><table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"5\"><tr><td valign=\"middle\" height=\"2\"><table width=\"300\" border=\"1\" cellspacing=\"0\" cellpadding=\"0\" bordercolor=\"#660000\" bgcolor=\"#EBEDCB\" height=\"5\" align=\"center\"><tr><td valign=\"middle\" height=\"8\"><div align=\"center\">\n",
      "<font face=\"Arial, Helvetica, sans-serif\" color=\"#660000\" size=\"5\">\n",
      "<b>For Immediate Release</b></font></div></td></tr></table></td></tr><tr><td><table width=\"100%\" border=\"1\" cellspacing=\"0\" cellpadding=\"8\" align=\"center\" bgcolor=\"#fafaf5\" bordercolor=\"#660000\"><tr><td bgcolor=\"#fafaf5\" valign=\"top\" height=\"453\">\n",
      "<p><font face=\"Verdana, Arial, Helvetica, sans-serif\" size=\"-1\">\n",
      "<b>Cal-Bay (Stock Symbol: CBYI)</b>\n",
      "<br>Watch for analyst \"Strong Buy Recommendations\" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.<br>\n",
      "<b><i>Put CBYI on your watch list, acquire a position TODAY.</i></b></font></p>\n",
      "<p><font face=\"Verdana, Arial, Helvetica, sans-serif\" size=\"-1\">\n",
      "<b>REASONS TO INVEST IN CBYI</b></font>\n",
      "<li><font face=\"Verdana, Arial, Helvetica, sans-serif\" size=\"-1\">\n",
      "A profitable company and is on track to beat ALL earnings estimates!</font>\n",
      "<li><font face=\"Verdana, Arial, Helvetica, sans-serif\" size=\"-1\">\n",
      "One of the FASTEST growing distributors in environmental &amp; safety equipment instruments.</font>\n",
      "<li><font face=\"Verdana, Arial, Helvetica, sans-serif\" color=\"#000000\" size=\"-1\">\n",
      "Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy &amp; Environmental Research.</font>\n",
      "<p><font face=\"Verdana, Arial, Helvetica, sans-serif\" size=\"-1\">\n",
      "<b>RAPIDLY GROWING INDUSTRY</b>\n",
      "<br>Industry revenues exceed $900 million, estimates indicate that there could be as much as $25 billion from \"smell technology\" by the end of 2003.</font></p>\n",
      "<p><font face=\"Verdana, Arial, Helvetica, sans-serif\" size=\"-1\">\n",
      "<b>!!!!!CONGRATULATIONS!!!!!</b><br>Our last recommendation to buy ORBT at $1.29 rallied and is holding steady at $3.50! Congratulations to all our subscribers that took advantage of this recommendation.</font></p><br><br><br><br><br><br><br><br>\n",
      "<p align=\"center\"><font size=\"-1\" face=\"Verdana, Arial, Helvetica, sans-serif\" color=\"#660000\">\n",
      "ALL removes HONORED. Please allow 7 days to be removed and send ALL addresses to:\n",
      "<font color=#ff0000>\n",
      "<a href=\"mailto:goneforgood@btamail.net.cn\">GoneForGood@btamail.net.cn\n",
      "</a></font></p></td></tr></table><table width=\"100%\" border=\"1\" cellspacing=\"0\" cellpadding=\"0\" bordercolor=\"#660000\"><tr><td bgcolor=\"#660000\" height=\"8\">&nbsp;</td></tr><tr><td bgcolor=\"#FFFFCC\"><table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"4\"><tr><td>\n",
      "<font size=\"-2\" face=\"Verdana, Arial, Helvetica, sans-serif\">\n",
      "Certain statements contained in this news release may be forward-looking statements within the meaning of The Private Securities Litigation Reform Act of 1995. These statements may be identified by such terms as \"expect\", \"believe\", \"may\", \"will\", and \"intend\" or similar terms. We are NOT a registered investment advisor or a broker dealer. This is NOT an offer to buy or sell securities. No recommendation that the securities of the companies profiled should be purchased, sold or held by individuals or entities that learn of the profiled companies. We were paid $27,000 in cash by a third party to publish this report. Investing in companies profiled is high-risk and use of this information is for reading purposes only. If anyone decides to act as an investor, then it will be that investor's sole risk. Investors are advised NOT to invest without the proper advisement from an attorney or a registered financial broker. Do not rely solely on the information presented, do additional independent research to form your own opinion and decision regarding investing in the profiled companies. Be advised that the purchase of such high-risk securities may result in the loss of your entire investment.  Not intended for recipients or residents of CA,CO,CT,DE,ID, IL,IA,LA,MO,NV,NC,OK,OH,PA,RI,TN,VA,WA,WV,WI. Void where prohibited.  The owners of this publication may already own free trading shares in CBYI and may immediately sell all or a portion of these shares into the open market at or about the time this report is published.  Factual statements are made as of the date stated and are subject to change without notice.\n",
      "<br>Copyright c 2001</font><hr width=\"100%\" noShade size=\"1\"></td></tr></table></td></tr></table></td></tr></table></TD></TR><TR><TD colspan=\"2\" height=\"31\"><table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"4\"><tr><td width=\"87%\" height=\"21\"><div align=\"right\">\n",
      "<font color=\"#666666\">&equiv;</font></div></td><td width=\"7%\" height=\"21\">\n",
      "<font face=\"Arial, Helvetica, sans-serif\" size=\"6\" color=\"#FFFFFF\">\n",
      "OTC</font></font></td><td width=\"6%\" height=\"21\"><div align=\"left\">\n",
      "<font color=\"#666666\">&equiv;</font></div></td></tr></table></TD></TR></TABLE></BODY></HTML>\n",
      "\n",
      "***\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for x in html_spam_emals[7].walk():\n",
    "    print(x.get_content())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "f:\\python37\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:17: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
      "  from collections import Mapping, defaultdict\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "having => have\n",
      "buffers => buffer\n",
      "Computing => comput\n",
      "Computed => comput\n",
      "Compute => comput\n",
      "Compulsive => compuls\n"
     ]
    }
   ],
   "source": [
    "# 装自然语言工具包（[nltk]（http://www.nltk.org/）\n",
    "# pip install nltk\n",
    "\n",
    "# 用“url”替换url的方法 \n",
    "# pip install urlextract\n",
    "import nltk\n",
    "from urlextract import URLExtract\n",
    "\n",
    "try:\n",
    "    import nltk\n",
    "\n",
    "    stemmer = nltk.PorterStemmer()\n",
    "    for word in (\"having\", \"buffers\", \"Computing\", \"Computed\", \"Compute\", \"Compulsive\"):\n",
    "        print(word, \"=>\", stemmer.stem(word))\n",
    "except ImportError:\n",
    "    print(\"Error: stemming requires the NLTK module.\")\n",
    "    stemmer = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将所有处理整合到一个转换器中，我们将使用它将电子邮件转换为文字计数器。注意，我们使用python的'split（）'方法将句子拆分为单词，该方法使用空格作为单词边界。但例如，汉语和日语脚本通常不在单词之间使用空格在这个练习中没关系，因为数据集（主要）是英文的，中文可以使用结巴分词来进行拆分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "class EmailToWordCounterTransformer(BaseEstimator,TransformerMixin):\n",
    "    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,\n",
    "                  replace_urls=True, replace_numbers=True, stemming = True):\n",
    "        self.strip_headers = strip_headers\n",
    "        self.lower_case = lower_case\n",
    "        self.remove_punctuation = remove_punctuation\n",
    "        self.replace_urls = replace_urls\n",
    "        self.replace_numbers = replace_numbers\n",
    "        self.stemming = stemming\n",
    "    def fit(self, X, y=None):\n",
    "        return self\n",
    "    def transform(self, X, y = None):\n",
    "        X_transformed = []\n",
    "        for email in X:\n",
    "            text = email_to_text(email) or \"\"\n",
    "            if self.lower_case: # 单词转小写\n",
    "                text = text.lower()\n",
    "            if self.replace_urls:  # 替换url\n",
    "                extractor = URLExtract()\n",
    "                urls = list(set(extractor.find_urls(text)))\n",
    "                urls.sort(key=lambda url: len(url),reverse=True)\n",
    "                for url in urls:\n",
    "                    text = text.replace(url, \"URL\")\n",
    "            if self.remove_punctuation:  # 去掉标点符号\n",
    "                text = re.sub(r'\\W+', ' ',text, flags=re.M)\n",
    "            if self.replace_numbers:\n",
    "                text = re.sub(r'\\d+', 'number', text,flags=re.M)\n",
    "            word_counts = Counter(text.split())\n",
    "            if self.stemming and stemmer is not None:\n",
    "                stemmed_word_counts = Counter()\n",
    "                for word, count in word_counts.items():\n",
    "                    stemmed_word = stemmer.stem(word)\n",
    "                    stemmed_word_counts[stemmed_word] += count\n",
    "                word_counts = stemmed_word_counts\n",
    "            X_transformed.append(word_counts)\n",
    "        return np.array(X_transformed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),\n",
       "       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'ha': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesu': 2, 'some': 1, 'interest': 1, 'quot': 1, 'url': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'thi': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'becom': 1, 'most': 1, 'pervert': 1, 'system': 1, 'that': 1, 'ever': 1, 'shone': 1, 'man': 1, 'absurd': 1, 'untruth': 1, 'were': 1, 'perpetr': 1, 'upon': 1, 'a': 1, 'larg': 1, 'band': 1, 'dupe': 1, 'import': 1, 'led': 1, 'paul': 1, 'first': 1, 'great': 1, 'corrupt': 1}),\n",
       "       Counter({'url': 4, 's': 3, 'group': 3, 'to': 3, 'in': 2, 'forteana': 2, 'martin': 2, 'an': 2, 'and': 2, 'we': 2, 'is': 2, 'yahoo': 2, 'unsubscrib': 2, 'y': 1, 'adamson': 1, 'wrote': 1, 'for': 1, 'altern': 1, 'rather': 1, 'more': 1, 'factual': 1, 'base': 1, 'rundown': 1, 'on': 1, 'hamza': 1, 'career': 1, 'includ': 1, 'hi': 1, 'belief': 1, 'that': 1, 'all': 1, 'non': 1, 'muslim': 1, 'yemen': 1, 'should': 1, 'be': 1, 'murder': 1, 'outright': 1, 'know': 1, 'how': 1, 'unbias': 1, 'memri': 1, 'don': 1, 't': 1, 'html': 1, 'rob': 1, 'sponsor': 1, 'number': 1, 'dvd': 1, 'free': 1, 'p': 1, 'join': 1, 'now': 1, 'from': 1, 'thi': 1, 'send': 1, 'email': 1, 'egroup': 1, 'com': 1, 'your': 1, 'use': 1, 'of': 1, 'subject': 1})],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_few = X_train[:3]\n",
    "X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)\n",
    "X_few_wordcounts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 有了单词计数，我们需要把它们转换成向量。为此，我们将构建另一个转换器，其“fit（）”方法将构建词汇表（最常用单词的有序列表），其“transform（）”方法将使用词汇表将单词计数转换为向量--稀疏矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.sparse import csr_matrix\n",
    "\n",
    "class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self, vocabulary_size=1000):\n",
    "        self.vocabulary_size = vocabulary_size\n",
    "    def fit(self, X, y =None):\n",
    "        total_count = Counter()\n",
    "        for word_count in X:\n",
    "            for word, count in word_count.items():\n",
    "                total_count[word] += min(count)\n",
    "        most_common = total_count.most_common()[:self.vocabulary_size]\n",
    "        self.most_common = most_common\n",
    "        self.vocabulary_ = {word: index +1 for index, (word,count) in enumerate(most_common)}\n",
    "        return self\n",
    "    def transform(self, X, y = None):\n",
    "        rows = []\n",
    "        cols = []\n",
    "        data = []\n",
    "        for row, word_count in enumerate(X):\n",
    "            for word, count in word_count.items():\n",
    "                rows.append(row) # 训练集 实例个数\n",
    "                cols.append(self.vocabulary_.get(word, 0)) # 取得单词在词汇表中的索引位置，0代表未出现在词汇表中\n",
    "                data.append(count)\n",
    "        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1)) # 输出稀疏矩阵 +1因为第一列要显示未出现在词汇表中的单词统计数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "'<' not supported between instances of 'int' and '_UnstructuredHeader'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-90-8583dfd7d346>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      9\u001b[0m \u001b[1;31m# team.toarray()\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     10\u001b[0m \u001b[0mword\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mWordCounterToVectorTransformer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 11\u001b[1;33m \u001b[0mword\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36mfit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m    515\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0my\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    516\u001b[0m             \u001b[1;31m# fit method of arity 1 (unsupervised transformation)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 517\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    518\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    519\u001b[0m             \u001b[1;31m# fit method of arity 2 (supervised transformation)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-38-7be25082bb95>\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m      8\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mword_count\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      9\u001b[0m             \u001b[1;32mfor\u001b[0m \u001b[0mword\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcount\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mword_count\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m                 \u001b[0mtotal_count\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mmin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     11\u001b[0m         \u001b[0mmost_common\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtotal_count\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmost_common\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvocabulary_size\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmost_common_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmost_common\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: '<' not supported between instances of 'int' and '_UnstructuredHeader'"
     ]
    }
   ],
   "source": [
    "\n",
    "# from scipy.sparse import *\n",
    " \n",
    "# row =  [0,0,0,1,1,1,2,2,2]#行指标\n",
    "# col =  [0,1,2,0,1,2,0,1,2]#列指标\n",
    "# data = [1,0,1,0,1,1,1,1,0]#在行指标列指标下的数字\n",
    "# team = csr_matrix((data,(row,col)),shape=(3,3))\n",
    "# print(team)\n",
    "# print(team.todense())\n",
    "# team.toarray()\n",
    "word = WordCounterToVectorTransformer()\n",
    "word.fit_transform(X[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 11)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)\n",
    "X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)\n",
    "X_few_vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],\n",
       "       [99, 11,  9,  8,  3,  1,  3,  1,  3,  2,  3],\n",
       "       [67,  0,  1,  2,  3,  4,  1,  2,  0,  1,  0]], dtype=int32)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_few_vectors.toarray()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 第三行第一列中的67表示第三封电子邮件包含64个不属于词汇表的单词。旁边的1表示词汇表中'of'单词在此电子邮件中出现一次。旁边的2表示'and'单词出现两次,'the'没有出现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'the': 1,\n",
       " 'of': 2,\n",
       " 'and': 3,\n",
       " 'to': 4,\n",
       " 'url': 5,\n",
       " 'all': 6,\n",
       " 'in': 7,\n",
       " 'christian': 8,\n",
       " 'on': 9,\n",
       " 'by': 10}"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_transformer.vocabulary_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 我们现在准备训练我们的第一个垃圾邮件分类器！让我们转换整个数据集："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-92-efcf71d26cfc>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      6\u001b[0m ])\n\u001b[0;32m      7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[0mX_train_transformed\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpreprocess_pipeline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\sklearn\\pipeline.py\u001b[0m in \u001b[0;36mfit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m    279\u001b[0m         \"\"\"\n\u001b[0;32m    280\u001b[0m         \u001b[0mlast_step\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_final_estimator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 281\u001b[1;33m         \u001b[0mXt\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfit_params\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    282\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlast_step\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'fit_transform'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    283\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mlast_step\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\sklearn\\pipeline.py\u001b[0m in \u001b[0;36m_fit\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m    211\u001b[0m                 Xt, fitted_transformer = fit_transform_one_cached(\n\u001b[0;32m    212\u001b[0m                     \u001b[0mcloned_transformer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mXt\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 213\u001b[1;33m                     **fit_params_steps[name])\n\u001b[0m\u001b[0;32m    214\u001b[0m                 \u001b[1;31m# Replace the transformer of the step with the fitted\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    215\u001b[0m                 \u001b[1;31m# transformer. This is necessary when loading the transformer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\sklearn\\externals\\joblib\\memory.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m    360\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    361\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 362\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    363\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    364\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mcall_and_shelve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\sklearn\\pipeline.py\u001b[0m in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, weight, X, y, **fit_params)\u001b[0m\n\u001b[0;32m    579\u001b[0m                        **fit_params):\n\u001b[0;32m    580\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtransformer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'fit_transform'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 581\u001b[1;33m         \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtransformer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    582\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    583\u001b[0m         \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtransformer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36mfit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m    515\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0my\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    516\u001b[0m             \u001b[1;31m# fit method of arity 1 (unsupervised transformation)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 517\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    518\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    519\u001b[0m             \u001b[1;31m# fit method of arity 2 (supervised transformation)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-36-7b83b12e0165>\u001b[0m in \u001b[0;36mtransform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m     19\u001b[0m                 \u001b[0mtext\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     20\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace_urls\u001b[0m\u001b[1;33m:\u001b[0m  \u001b[1;31m# 替换url\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 21\u001b[1;33m                 \u001b[0mextractor\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mURLExtract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     22\u001b[0m                 \u001b[0murls\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mextractor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_urls\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     23\u001b[0m                 \u001b[0murls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mreverse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\urlextract\\urlextract_core.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, extract_email, **kwargs)\u001b[0m\n\u001b[0;32m     78\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     79\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_tlds_re\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 80\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reload_tlds_from_file\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     81\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_extract_email\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mextract_email\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     82\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\urlextract\\urlextract_core.py\u001b[0m in \u001b[0;36m_reload_tlds_from_file\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    117\u001b[0m         \"\"\"\n\u001b[0;32m    118\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 119\u001b[1;33m         \u001b[0mtlds\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msorted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_load_cached_tlds\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreverse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    120\u001b[0m         \u001b[0mtlds\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_ipv4_tld\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    121\u001b[0m         \u001b[0mre_escaped\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mre\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mescape\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtld\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtld\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtlds\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\urlextract\\cachefile.py\u001b[0m in \u001b[0;36m_load_cached_tlds\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    221\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    222\u001b[0m                 \u001b[0mset_of_tlds\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\".\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mtld\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 223\u001b[1;33m                 \u001b[0mset_of_tlds\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\".\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0midna\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtld\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    224\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    225\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mset_of_tlds\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\idna\\core.py\u001b[0m in \u001b[0;36mdecode\u001b[1;34m(s, strict, uts46, std3_rules)\u001b[0m\n\u001b[0;32m    387\u001b[0m         \u001b[0mtrailing_dot\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    388\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mlabel\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlabels\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 389\u001b[1;33m         \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mulabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    390\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    391\u001b[0m             \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\idna\\core.py\u001b[0m in \u001b[0;36mulabel\u001b[1;34m(label)\u001b[0m\n\u001b[0;32m    302\u001b[0m         \u001b[0mlabel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlabel\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_alabel_prefix\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    303\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 304\u001b[1;33m         \u001b[0mcheck_label\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    305\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mlabel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'ascii'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    306\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\idna\\core.py\u001b[0m in \u001b[0;36mcheck_label\u001b[1;34m(label)\u001b[0m\n\u001b[0;32m    261\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mInvalidCodepoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Codepoint {0} at position {1} of {2} not allowed'\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_unot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcp_value\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpos\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrepr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    262\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 263\u001b[1;33m     \u001b[0mcheck_bidi\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    264\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    265\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mf:\\python37\\lib\\site-packages\\idna\\core.py\u001b[0m in \u001b[0;36mcheck_bidi\u001b[1;34m(label, check_ltr)\u001b[0m\n\u001b[0;32m     70\u001b[0m     \u001b[0mbidi_label\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     71\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0midx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcp\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 72\u001b[1;33m         \u001b[0mdirection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0municodedata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbidirectional\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     73\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mdirection\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m''\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     74\u001b[0m             \u001b[1;31m# String likely comes from a newer version of Unicode\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "preprocess_pipeline = Pipeline([\n",
    "    (\"email_to_wordcount\", EmailToWordCounterTransformer()),\n",
    "    (\"wordcount_to_vector\", WordCounterToVectorTransformer()),\n",
    "])\n",
    "\n",
    "X_train_transformed = preprocess_pipeline.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CV]  ................................................................\n",
      "[CV] .................................. , score=0.98125, total=   0.0s\n",
      "[CV]  ................................................................\n",
      "[CV] .................................... , score=0.985, total=   0.0s\n",
      "[CV]  ................................................................\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s\n",
      "[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CV] .................................. , score=0.99125, total=   0.2s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.9858333333333333"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42) # 采用逻辑回归分类器\n",
    "score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 得到分数超过98.7%，可以尝试多个模型，选择最好的模型，并使用交叉验证对它们进行微调。在测试集上得到的精度/召回率："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "精度: 96.88%\n",
      "召回: 97.89%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import precision_score, recall_score\n",
    "\n",
    "X_test_transformed = preprocess_pipeline.transform(X_test)\n",
    "\n",
    "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
    "log_clf.fit(X_train_transformed, y_train)\n",
    "\n",
    "y_pred = log_clf.predict(X_test_transformed)\n",
    "\n",
    "print(\"精度: {:.2f}%\".format(100 * precision_score(y_test, y_pred)))\n",
    "print(\"召回: {:.2f}%\".format(100 * recall_score(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 总结\n",
    "1. 加载数据并纵观数据大局\n",
    "2. 获取邮件的组成结构\n",
    "3. 对结构类型进行分析 发现垃圾邮件大多有HTML结构\n",
    "4. 数据清洗，定义email对象中的HTML转换称纯文本方法\n",
    "5. 对数据集拆分成训练集和测试集\n",
    "6. 数据处理转换，对邮件的文本内容进行分词处理，通过nltk进行词干提取，对邮件出现的词汇进行计数统计，对所有邮件统计出了一个词汇表\n",
    "7. 通过词汇表和邮件单词计数统计，将单词计数转化成向量矩阵\n",
    "8. 把数据清洗和数据处理封装成两个转换器\n",
    "9. 通过流水线来自动化处理数据\n",
    "10. 使用逻辑回归线性分类器进行模型训练\n",
    "11. 使用交叉验证进行微调\n",
    "12. 在测试集上得到精度/召回率"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CV]  ................................................................\n",
      "[CV] ....................... , score=0.9402985074626866, total=   0.1s\n",
      "[CV]  ................................................................\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CV] .................................... , score=0.955, total=   0.1s\n",
      "[CV]  ................................................................\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CV] ....................... , score=0.9597989949748744, total=   0.1s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.9516991674791869"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "X_test_transformed = preprocess_pipeline.fit_transform(X_test)\n",
    "forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
    "score = cross_val_score(forest_clf, X_test_transformed, y_test, cv=3, verbose=3)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "            min_samples_leaf=1, min_samples_split=2,\n",
       "            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,\n",
       "            oob_score=False, random_state=42, verbose=0, warm_start=False),\n",
       "       fit_params=None, iid=True, n_jobs=1,\n",
       "       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
       "       scoring='neg_mean_squared_error', verbose=0)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "param_grid = [\n",
    "    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
    "    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
    "  ]\n",
    "\n",
    "grid_search = GridSearchCV(forest_clf, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)\n",
    "grid_search.fit(X_train_transformed, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.27156951228000537 {'max_features': 2, 'n_estimators': 3}\n",
      "0.24238399287081644 {'max_features': 2, 'n_estimators': 10}\n",
      "0.2345207879911715 {'max_features': 2, 'n_estimators': 30}\n",
      "0.2614064523559687 {'max_features': 4, 'n_estimators': 3}\n",
      "0.23094010767585033 {'max_features': 4, 'n_estimators': 10}\n",
      "0.21505813167606566 {'max_features': 4, 'n_estimators': 30}\n",
      "0.23003622903070436 {'max_features': 6, 'n_estimators': 3}\n",
      "0.1925703334715224 {'max_features': 6, 'n_estimators': 10}\n",
      "0.1755942292142123 {'max_features': 6, 'n_estimators': 30}\n",
      "0.23892118644719085 {'max_features': 8, 'n_estimators': 3}\n",
      "0.19578900207451216 {'max_features': 8, 'n_estimators': 10}\n",
      "0.17320508075688773 {'max_features': 8, 'n_estimators': 30}\n",
      "0.24664414311581237 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}\n",
      "0.22453655975512465 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}\n",
      "0.257390753524675 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}\n",
      "0.2217355782608345 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}\n",
      "0.26614532371118854 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}\n",
      "0.22267315359812312 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}\n"
     ]
    }
   ],
   "source": [
    "cvres = grid_search.cv_results_\n",
    "for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):\n",
    "    print(np.sqrt(-mean_score), params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "'csr_matrix' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-96-fa1b62c47cb1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0msome_labels\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0msome_data_prepared\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX_train_transformed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msome_data\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Predictions:\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mforest_clf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msome_data_prepared\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: 'csr_matrix' object is not callable"
     ]
    }
   ],
   "source": [
    "some_data = X_test[:5]\n",
    "some_labels = y_test[:5]\n",
    "\n",
    "some_data_prepared = X_train_transformed(some_data)\n",
    "print(\"Predictions:\", forest_clf.predict(some_data_prepared))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([<email.message.EmailMessage object at 0x0000023ECB3FE888>,\n",
       "       <email.message.EmailMessage object at 0x0000023ECAF98AC8>,\n",
       "       <email.message.EmailMessage object at 0x0000023ECB40F948>,\n",
       "       <email.message.EmailMessage object at 0x0000023EC9767488>,\n",
       "       <email.message.EmailMessage object at 0x0000023ECB6D6DC8>],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
